import os
import warnings
warnings.simplefilter(action = 'ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
def ignore_warn(*args, **kwargs):
pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)
import pandas as pd
import datetime
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.cm as cm
%matplotlib inline
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
import seaborn as sns
sns.set(style="ticks", color_codes=True, font_scale=1.5)
color = sns.color_palette()
sns.set_style('darkgrid')
from mpl_toolkits.mplot3d import Axes3D
import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode()
from scipy import stats
from scipy.stats import skew, norm, probplot, boxcox
from sklearn import preprocessing
import math
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
# import Orange
# from Orange.data import Domain, DiscreteVariable, ContinuousVariable
# from orangecontrib.associate.fpgrowth import *
import pandas as pd
cs_df = pd.read_csv ('../data/raw_online_retail.csv')
I created the function below to simplify the analysis of general characteristics of the data. Inspired on the str function of R, this function returns the types, counts, distinct, count nulls, missing ratio and uniques values of each field/feature.
If the study involve some supervised learning, this function can return the study of the correlation, for this we just need provide the dependent variable to the pred parameter.
Also, if its return is stored in a variable you can evaluate it in more detail, focus on specific field, or sort them from different perspectives.
cs_df.dtypes
invoiceno object stockcode object description object quantity int64 invoicedate object unitprice float64 customerid float64 country object dtype: object
cs_df['invoicedate'] = pd.to_datetime(cs_df['invoicedate'])
cs_df.dtypes
invoiceno object stockcode object description object quantity int64 invoicedate datetime64[ns] unitprice float64 customerid float64 country object dtype: object
print('Total rows:', len(cs_df))
cs_df.duplicated().sum()
Total rows: 541909
5268
cs_df = cs_df.drop_duplicates()
print('Total rows:', len(cs_df))
Total rows: 536641
cs_df.isnull().sum()
invoiceno 0 stockcode 0 description 1454 quantity 0 invoicedate 0 unitprice 0 customerid 135037 country 0 dtype: int64
def rstr(df, pred=None):
obs = df.shape[0]
types = df.dtypes
print('___________________________\nData types:\n')
print(types)
counts = df.apply(lambda x: x.count())
print('___________________________\nCounts:\n')
print(counts)
uniques = df.apply(lambda x: [x.unique()])
print('___________________________\nUniques:\n')
print(uniques)
nulls = df.apply(lambda x: x.isnull().sum())
print('___________________________\nNulls:\n')
print(nulls)
distincts = df.apply(lambda x: x.unique().shape[0])
print('___________________________\nDistincs:\n')
print(distincts)
missing_ration = (df.isnull().sum()/ obs) * 100
print('___________________________\nMissing Ratio:\n')
print(missing_ration)
skewness = df.skew()
print('___________________________\nSkewness:\n')
print(skewness)
kurtosis = df.kurt()
print('___________________________\nKurtosis:\n')
print(kurtosis)
print('Data shape:', df.shape)
details = rstr(cs_df)
# display(details.sort_values(by='missing ration', ascending=False))
___________________________
Data types:
invoiceno object
stockcode object
description object
quantity int64
invoicedate datetime64[ns]
unitprice float64
customerid float64
country object
dtype: object
___________________________
Counts:
invoiceno 536641
stockcode 536641
description 535187
quantity 536641
invoicedate 536641
unitprice 536641
customerid 401604
country 536641
dtype: int64
___________________________
Uniques:
invoiceno \
0 [536365, 536366, 536367, 536368, 536369, 53637...
stockcode \
0 [85123A, 71053, 84406B, 84029G, 84029E, 22752,...
description \
0 [WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET...
quantity \
0 [6, 8, 2, 32, 3, 4, 24, 12, 48, 18, 20, 36, 80...
invoicedate \
0 [2010-12-01T08:26:00.000000000, 2010-12-01T08:...
unitprice \
0 [2.55, 3.39, 2.75, 7.65, 4.25, 1.85, 1.69, 2.1...
customerid \
0 [17850.0, 13047.0, 12583.0, 13748.0, 15100.0, ...
country
0 [United Kingdom, France, Australia, Netherland...
___________________________
Nulls:
invoiceno 0
stockcode 0
description 1454
quantity 0
invoicedate 0
unitprice 0
customerid 135037
country 0
dtype: int64
___________________________
Distincs:
invoiceno 25900
stockcode 4070
description 4224
quantity 722
invoicedate 23260
unitprice 1630
customerid 4373
country 38
dtype: int64
___________________________
Missing Ratio:
invoiceno 0.000000
stockcode 0.000000
description 0.270945
quantity 0.000000
invoicedate 0.000000
unitprice 0.000000
customerid 25.163377
country 0.000000
dtype: float64
___________________________
Skewness:
quantity -0.263921
unitprice 185.600448
customerid 0.034215
dtype: float64
___________________________
Kurtosis:
quantity 118645.544552
unitprice 58433.135911
customerid -1.179861
dtype: float64
Data shape: (536641, 8)
cs_df.describe()
| quantity | unitprice | customerid | |
|---|---|---|---|
| count | 536641.000000 | 536641.000000 | 401604.000000 |
| mean | 9.620029 | 4.632656 | 15281.160818 |
| std | 219.130156 | 97.233118 | 1714.006089 |
| min | -80995.000000 | -11062.060000 | 12346.000000 |
| 25% | 1.000000 | 1.250000 | 13939.000000 |
| 50% | 3.000000 | 2.080000 | 15145.000000 |
| 75% | 10.000000 | 4.130000 | 16784.000000 |
| max | 80995.000000 | 38970.000000 | 18287.000000 |
print('Check if we had negative quantity and prices at same register:',
'No' if cs_df[(cs_df.quantity<0) & (cs_df.unitprice<0)].shape[0] == 0 else 'Yes', '\n')
print('Check how many register we have where quantity is negative',
'and prices is 0 or vice-versa:',
cs_df[(cs_df.quantity<=0) & (cs_df.unitprice<=0)].shape[0])
print('\nWhat is the customer ID of the registers above:',
cs_df.loc[(cs_df.quantity<=0) & (cs_df.unitprice<=0),
['customerid']].customerid.unique())
print('\n% Negative Quantity: {:3.2%}'.format(cs_df[(cs_df.quantity<0)].shape[0]/cs_df.shape[0]))
print('\nAll register with negative quantity has Invoice start with:',
cs_df.loc[(cs_df.quantity<0) & ~(cs_df.customerid.isnull()), 'invoiceno'].apply(lambda x: x[0]).unique())
# print('\nSee an example of negative quantity and others related records:')
# display(cs_df[(cs_df.customerid==12472) & (cs_df.stockcode==22244)])
Check if we had negative quantity and prices at same register: No Check how many register we have where quantity is negative and prices is 0 or vice-versa: 1336 What is the customer ID of the registers above: [nan] % Negative Quantity: 1.97% All register with negative quantity has Invoice start with: ['C']
print('Check register with UnitPrice negative:')
display(cs_df[(cs_df.unitprice<0)])
print("Sales records with Customer ID and zero in Unit Price:",cs_df[(cs_df.unitprice==0) & ~(cs_df.customerid.isnull())].shape[0])
cs_df[(cs_df.unitprice==0) & ~(cs_df.customerid.isnull())]
Check register with UnitPrice negative:
| invoiceno | stockcode | description | quantity | invoicedate | unitprice | customerid | country | |
|---|---|---|---|---|---|---|---|---|
| 299983 | A563186 | B | Adjust bad debt | 1 | 2011-08-12 14:51:00 | -11062.06 | NaN | United Kingdom |
| 299984 | A563187 | B | Adjust bad debt | 1 | 2011-08-12 14:52:00 | -11062.06 | NaN | United Kingdom |
Sales records with Customer ID and zero in Unit Price: 40
| invoiceno | stockcode | description | quantity | invoicedate | unitprice | customerid | country | |
|---|---|---|---|---|---|---|---|---|
| 9302 | 537197 | 22841 | ROUND CAKE TIN VINTAGE GREEN | 1 | 2010-12-05 14:02:00 | 0.0 | 12647.0 | Germany |
| 33576 | 539263 | 22580 | ADVENT CALENDAR GINGHAM SACK | 4 | 2010-12-16 14:36:00 | 0.0 | 16560.0 | United Kingdom |
| 40089 | 539722 | 22423 | REGENCY CAKESTAND 3 TIER | 10 | 2010-12-21 13:45:00 | 0.0 | 14911.0 | EIRE |
| 47068 | 540372 | 22090 | PAPER BUNTING RETROSPOT | 24 | 2011-01-06 16:41:00 | 0.0 | 13081.0 | United Kingdom |
| 47070 | 540372 | 22553 | PLASTERS IN TIN SKULLS | 24 | 2011-01-06 16:41:00 | 0.0 | 13081.0 | United Kingdom |
| 56674 | 541109 | 22168 | ORGANISER WOOD ANTIQUE WHITE | 1 | 2011-01-13 15:10:00 | 0.0 | 15107.0 | United Kingdom |
| 86789 | 543599 | 84535B | FAIRY CAKES NOTEBOOK A6 SIZE | 16 | 2011-02-10 13:08:00 | 0.0 | 17560.0 | United Kingdom |
| 130188 | 547417 | 22062 | CERAMIC BOWL WITH LOVE HEART DESIGN | 36 | 2011-03-23 10:25:00 | 0.0 | 13239.0 | United Kingdom |
| 139453 | 548318 | 22055 | MINI CAKE STAND HANGING STRAWBERY | 5 | 2011-03-30 12:45:00 | 0.0 | 13113.0 | United Kingdom |
| 145208 | 548871 | 22162 | HEART GARLAND RUSTIC PADDED | 2 | 2011-04-04 14:42:00 | 0.0 | 14410.0 | United Kingdom |
| 157042 | 550188 | 22636 | CHILDS BREAKFAST SET CIRCUS PARADE | 1 | 2011-04-14 18:57:00 | 0.0 | 12457.0 | Switzerland |
| 187613 | 553000 | 47566 | PARTY BUNTING | 4 | 2011-05-12 15:21:00 | 0.0 | 17667.0 | United Kingdom |
| 198383 | 554037 | 22619 | SET OF 6 SOLDIER SKITTLES | 80 | 2011-05-20 14:13:00 | 0.0 | 12415.0 | Australia |
| 279324 | 561284 | 22167 | OVAL WALL MIRROR DIAMANTE | 1 | 2011-07-26 12:24:00 | 0.0 | 16818.0 | United Kingdom |
| 282912 | 561669 | 22960 | JAM MAKING SET WITH JARS | 11 | 2011-07-28 17:09:00 | 0.0 | 12507.0 | Spain |
| 285657 | 561916 | M | Manual | 1 | 2011-08-01 11:44:00 | 0.0 | 15581.0 | United Kingdom |
| 298054 | 562973 | 23157 | SET OF 6 NATIVITY MAGNETS | 240 | 2011-08-11 11:42:00 | 0.0 | 14911.0 | EIRE |
| 314745 | 564651 | 23270 | SET OF 2 CERAMIC PAINTED HEARTS | 96 | 2011-08-26 14:19:00 | 0.0 | 14646.0 | Netherlands |
| 314746 | 564651 | 23268 | SET OF 2 CERAMIC CHRISTMAS REINDEER | 192 | 2011-08-26 14:19:00 | 0.0 | 14646.0 | Netherlands |
| 314747 | 564651 | 22955 | 36 FOIL STAR CAKE CASES | 144 | 2011-08-26 14:19:00 | 0.0 | 14646.0 | Netherlands |
| 314748 | 564651 | 21786 | POLKADOT RAIN HAT | 144 | 2011-08-26 14:19:00 | 0.0 | 14646.0 | Netherlands |
| 358655 | 568158 | PADS | PADS TO MATCH ALL CUSHIONS | 1 | 2011-09-25 12:22:00 | 0.0 | 16133.0 | United Kingdom |
| 361825 | 568384 | M | Manual | 1 | 2011-09-27 09:46:00 | 0.0 | 12748.0 | United Kingdom |
| 379913 | 569716 | 22778 | GLASS CLOCHE SMALL | 2 | 2011-10-06 08:17:00 | 0.0 | 15804.0 | United Kingdom |
| 395529 | 571035 | M | Manual | 1 | 2011-10-13 12:50:00 | 0.0 | 12446.0 | RSA |
| 420404 | 572893 | 21208 | PASTEL COLOUR HONEYCOMB FAN | 5 | 2011-10-26 14:36:00 | 0.0 | 18059.0 | United Kingdom |
| 436428 | 574138 | 23234 | BISCUIT TIN VINTAGE CHRISTMAS | 216 | 2011-11-03 11:26:00 | 0.0 | 12415.0 | Australia |
| 436597 | 574175 | 22065 | CHRISTMAS PUDDING TRINKET POT | 12 | 2011-11-03 11:47:00 | 0.0 | 14110.0 | United Kingdom |
| 436961 | 574252 | M | Manual | 1 | 2011-11-03 13:24:00 | 0.0 | 12437.0 | France |
| 439361 | 574469 | 22385 | JUMBO BAG SPACEBOY DESIGN | 12 | 2011-11-04 11:55:00 | 0.0 | 12431.0 | Australia |
| 446125 | 574879 | 22625 | RED KITCHEN SCALES | 2 | 2011-11-07 13:22:00 | 0.0 | 13014.0 | United Kingdom |
| 446793 | 574920 | 22899 | CHILDREN'S APRON DOLLY GIRL | 1 | 2011-11-07 16:34:00 | 0.0 | 13985.0 | United Kingdom |
| 446794 | 574920 | 23480 | MINI LIGHTS WOODLAND MUSHROOMS | 1 | 2011-11-07 16:34:00 | 0.0 | 13985.0 | United Kingdom |
| 454463 | 575579 | 22437 | SET OF 9 BLACK SKULL BALLOONS | 20 | 2011-11-10 11:49:00 | 0.0 | 13081.0 | United Kingdom |
| 454464 | 575579 | 22089 | PAPER BUNTING VINTAGE PAISLEY | 24 | 2011-11-10 11:49:00 | 0.0 | 13081.0 | United Kingdom |
| 479079 | 577129 | 22464 | HANGING METAL HEART LANTERN | 4 | 2011-11-17 19:52:00 | 0.0 | 15602.0 | United Kingdom |
| 479546 | 577168 | M | Manual | 1 | 2011-11-18 10:42:00 | 0.0 | 12603.0 | Germany |
| 480649 | 577314 | 23407 | SET OF 2 TRAYS HOME SWEET HOME | 2 | 2011-11-18 13:23:00 | 0.0 | 12444.0 | Norway |
| 485985 | 577696 | M | Manual | 1 | 2011-11-21 11:57:00 | 0.0 | 16406.0 | United Kingdom |
| 502122 | 578841 | 84826 | ASSTD DESIGN 3D PAPER STICKERS | 12540 | 2011-11-25 15:57:00 | 0.0 | 13256.0 | United Kingdom |
As you can see, there are no records where quantity and price are negative, but there are 1.336 records where one of them is and the other is 0. However, note that for all these records we do not have the customer ID. So we conclude that we can erase all records in that quantity or the price and negative. In addition, by the foregoing summary we see that there are 135,080 records without customer identification that we may also disregard.
# Remove register without CustomerID
cs_df = cs_df[~(cs_df.customerid.isnull())]
print('Total rows:', len(cs_df))
details = rstr(cs_df)
Total rows: 401604
___________________________
Data types:
invoiceno object
stockcode object
description object
quantity int64
invoicedate datetime64[ns]
unitprice float64
customerid float64
country object
dtype: object
___________________________
Counts:
invoiceno 401604
stockcode 401604
description 401604
quantity 401604
invoicedate 401604
unitprice 401604
customerid 401604
country 401604
dtype: int64
___________________________
Uniques:
invoiceno \
0 [536365, 536366, 536367, 536368, 536369, 53637...
stockcode \
0 [85123A, 71053, 84406B, 84029G, 84029E, 22752,...
description \
0 [WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET...
quantity \
0 [6, 8, 2, 32, 3, 4, 24, 12, 48, 18, 20, 36, 80...
invoicedate \
0 [2010-12-01T08:26:00.000000000, 2010-12-01T08:...
unitprice \
0 [2.55, 3.39, 2.75, 7.65, 4.25, 1.85, 1.69, 2.1...
customerid \
0 [17850.0, 13047.0, 12583.0, 13748.0, 15100.0, ...
country
0 [United Kingdom, France, Australia, Netherland...
___________________________
Nulls:
invoiceno 0
stockcode 0
description 0
quantity 0
invoicedate 0
unitprice 0
customerid 0
country 0
dtype: int64
___________________________
Distincs:
invoiceno 22190
stockcode 3684
description 3896
quantity 436
invoicedate 20460
unitprice 620
customerid 4372
country 37
dtype: int64
___________________________
Missing Ratio:
invoiceno 0.0
stockcode 0.0
description 0.0
quantity 0.0
invoicedate 0.0
unitprice 0.0
customerid 0.0
country 0.0
dtype: float64
___________________________
Skewness:
quantity 0.179957
unitprice 449.316245
customerid 0.034215
dtype: float64
___________________________
Kurtosis:
quantity 93140.274032
unitprice 243760.949868
customerid -1.179861
dtype: float64
Data shape: (401604, 8)
print(len(cs_df[(cs_df.quantity<0)]), len(cs_df['invoiceno'].str.contains('C')),
len(cs_df.loc[cs_df['quantity'] < 0 & (cs_df['invoiceno'].str.contains('C'))]) )
8872 401604 8872
# Remove negative or return transactions
# cs_df = cs_df[~(cs_df.quantity<0)]
cs_df = cs_df[cs_df.unitprice>0]
print('Total rows:', len(cs_df))
Total rows: 401564
cat_des_df = cs_df.groupby(["stockcode","description"]).count().reset_index()
display(cat_des_df.stockcode.value_counts()[cat_des_df.stockcode.value_counts()>1].reset_index().head())
cs_df[cs_df['stockcode'] == cat_des_df.stockcode.value_counts()[cat_des_df.stockcode.value_counts()>1]
.reset_index()['index'][4]]['description'].unique()
| index | stockcode | |
|---|---|---|
| 0 | 23236 | 4 |
| 1 | 23196 | 4 |
| 2 | 22776 | 3 |
| 3 | 23203 | 3 |
| 4 | 17107D | 3 |
array(["FLOWER FAIRY,5 SUMMER B'DRAW LINERS",
'FLOWER FAIRY 5 DRAWER LINERS',
'FLOWER FAIRY 5 SUMMER DRAW LINERS'], dtype=object)
unique_desc = cs_df[["stockcode", "description"]].groupby(by=["stockcode"]).\
apply(pd.DataFrame.mode).reset_index(drop=True)
q = '''
select df.invoiceno, df.stockcode, un.description, df.quantity, df.invoicedate,
df.unitprice, df.customerid, df.country
from cs_df as df INNER JOIN
unique_desc as un on df.stockcode = un.stockcode
'''
cs_df = pysqldf(q)
cs_df['cancel'] = False
cs_df.loc[cs_df['invoiceno'].str.contains('C'), 'cancel'] = True
cs_df[cs_df['invoiceno'].str.contains('C')]
| invoiceno | stockcode | description | quantity | invoicedate | unitprice | customerid | country | cancel | |
|---|---|---|---|---|---|---|---|---|---|
| 141 | C536379 | D | Discount | -1 | 2010-12-01 09:41:00.000000 | 27.50 | 14527.0 | United Kingdom | True |
| 154 | C536383 | 35004C | SET OF 3 COLOURED FLYING DUCKS | -1 | 2010-12-01 09:49:00.000000 | 4.65 | 15311.0 | United Kingdom | True |
| 235 | C536391 | 22556 | PLASTERS IN TIN CIRCUS PARADE | -12 | 2010-12-01 10:24:00.000000 | 1.65 | 17548.0 | United Kingdom | True |
| 236 | C536391 | 21984 | PACK OF 12 PINK PAISLEY TISSUES | -24 | 2010-12-01 10:24:00.000000 | 0.29 | 17548.0 | United Kingdom | True |
| 237 | C536391 | 21983 | PACK OF 12 BLUE PAISLEY TISSUES | -24 | 2010-12-01 10:24:00.000000 | 0.29 | 17548.0 | United Kingdom | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 401119 | C581490 | 23144 | ZINC T-LIGHT HOLDER STARS SMALL | -11 | 2011-12-09 09:57:00.000000 | 0.83 | 14397.0 | United Kingdom | True |
| 401203 | C581499 | M | Manual | -1 | 2011-12-09 10:28:00.000000 | 224.69 | 15498.0 | United Kingdom | True |
| 401370 | C581568 | 21258 | VICTORIAN SEWING BOX LARGE | -5 | 2011-12-09 11:57:00.000000 | 10.95 | 15311.0 | United Kingdom | True |
| 401371 | C581569 | 84978 | HANGING HEART JAR T-LIGHT HOLDER | -1 | 2011-12-09 11:58:00.000000 | 1.25 | 17315.0 | United Kingdom | True |
| 401372 | C581569 | 20979 | 36 PENCILS TUBE RED RETROSPOT | -5 | 2011-12-09 11:58:00.000000 | 1.25 | 17315.0 | United Kingdom | True |
8872 rows × 9 columns
# cs_df['quantity'] = np.where(cs_df['quantity'] < 0, -(cs_df['quantity']) ,cs_df['quantity'])
cs_df['amount'] = cs_df.quantity*cs_df.unitprice
cs_df.customerid = cs_df.customerid.astype('Int64')
details = rstr(cs_df)
# display(details.sort_values(by='distincts', ascending=False))
___________________________
Data types:
invoiceno object
stockcode object
description object
quantity int64
invoicedate object
unitprice float64
customerid Int64
country object
cancel bool
amount float64
dtype: object
___________________________
Counts:
invoiceno 401564
stockcode 401564
description 401564
quantity 401564
invoicedate 401564
unitprice 401564
customerid 401564
country 401564
cancel 401564
amount 401564
dtype: int64
___________________________
Uniques:
invoiceno \
0 [536365, 536366, 536367, 536368, 536369, 53637...
stockcode \
0 [85123A, 71053, 84406B, 84029G, 84029E, 22752,...
description \
0 [WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET...
quantity \
0 [6, 8, 2, 32, 3, 4, 24, 12, 48, 18, 20, 36, 80...
invoicedate \
0 [2010-12-01 08:26:00.000000, 2010-12-01 08:28:...
unitprice \
0 [2.55, 3.39, 2.75, 7.65, 4.25, 1.85, 1.69, 2.1...
customerid \
0 [17850, 13047, 12583, 13748, 15100, 15291, 146...
country cancel \
0 [United Kingdom, France, Australia, Netherland... [False, True]
amount
0 [15.299999999999999, 20.34, 22.0, 15.3, 25.5, ...
___________________________
Nulls:
invoiceno 0
stockcode 0
description 0
quantity 0
invoicedate 0
unitprice 0
customerid 0
country 0
cancel 0
amount 0
dtype: int64
___________________________
Distincs:
invoiceno 22186
stockcode 3684
description 3666
quantity 435
invoicedate 20456
unitprice 619
customerid 4371
country 37
cancel 2
amount 4040
dtype: int64
___________________________
Missing Ratio:
invoiceno 0.0
stockcode 0.0
description 0.0
quantity 0.0
invoicedate 0.0
unitprice 0.0
customerid 0.0
country 0.0
cancel 0.0
amount 0.0
dtype: float64
___________________________
Skewness:
quantity -0.133165
unitprice 449.294020
customerid 0.034169
cancel 6.502681
amount -0.058014
dtype: float64
___________________________
Kurtosis:
quantity 94289.525971
unitprice 243736.782633
customerid -1.179854
cancel 40.285057
amount 122433.126656
dtype: float64
Data shape: (401564, 10)
cs_df
| invoiceno | stockcode | description | quantity | invoicedate | unitprice | customerid | country | cancel | amount | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00.000000 | 2.55 | 17850 | United Kingdom | False | 15.30 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00.000000 | 3.39 | 17850 | United Kingdom | False | 20.34 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00.000000 | 2.75 | 17850 | United Kingdom | False | 22.00 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00.000000 | 3.39 | 17850 | United Kingdom | False | 20.34 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00.000000 | 3.39 | 17850 | United Kingdom | False | 20.34 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 401559 | 581587 | 22613 | PACK OF 20 SPACEBOY NAPKINS | 12 | 2011-12-09 12:50:00.000000 | 0.85 | 12680 | France | False | 10.20 |
| 401560 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 2011-12-09 12:50:00.000000 | 2.10 | 12680 | France | False | 12.60 |
| 401561 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 2011-12-09 12:50:00.000000 | 4.15 | 12680 | France | False | 16.60 |
| 401562 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 2011-12-09 12:50:00.000000 | 4.15 | 12680 | France | False | 16.60 |
| 401563 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 2011-12-09 12:50:00.000000 | 4.95 | 12680 | France | False | 14.85 |
401564 rows × 10 columns
cs_df[cs_df['cancel']==True & (cs_df['quantity']>0)]
| invoiceno | stockcode | description | quantity | invoicedate | unitprice | customerid | country | cancel | amount |
|---|
cs_df.to_csv('../data/cleaned_online_retail.csv')